Setup the Seurat Object
library(dplyr)
library(Seurat)
library(patchwork)
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "/home/lambi/Desktop/studia/ModZlo/lab7/fresh68/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc68k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 17788 features across 68551 samples within 1 assay
## Active assay: RNA (17788 features, 0 variable features)
## 1 layer present: counts
# shrink the dataset - ram problems
set.seed(42)
cells_20k <- sample(colnames(pbmc), size = 20000)
pbmc <- subset(pbmc, cells = cells_20k)
QC and selecting cells for further analysis
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# percentile cut-off
max_features <- quantile(pbmc$nFeature_RNA, 0.99)
max_counts <- quantile(pbmc$nCount_RNA, 0.99)
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < max_features & nCount_RNA < max_counts &
percent.mt < 5)
Normalizing the data
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
Identification of highly variable features (feature selection)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

Scaling the data
pbmc <- ScaleData(pbmc)
Perform linear dimensional reduction
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), npcs = 30)
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: CST3, SERPINA1, LST1, RP11-290F20.3, SPI1
## Negative: RPS6, RPL13, B2M, RPS2, LTB
## PC_ 2
## Positive: NKG7, GNLY, CST7, GZMB, GZMA
## Negative: LTB, RPL13, RPS2, CD79A, RPS6
## PC_ 3
## Positive: RPS2, RPS6, RPL13, RPL34, AIF1
## Negative: CD79A, PDLIM1, TCL1A, MS4A1, GNG11
## PC_ 4
## Positive: CD79A, TCL1A, HLA-DPB1, NKG7, CD79B
## Negative: PPBP, PF4, SDPR, NRGN, TUBB1
## PC_ 5
## Positive: B2M, MS4A7, VMO1, RP11-290F20.3, CDKN1C
## Negative: S100A8, S100A9, LGALS2, CD14, CST3
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset
pbmc <- JackStraw(pbmc, num.replicate = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:15) #thresolding
JackStrawPlot(pbmc, dims = 1:15)

ElbowPlot(pbmc)

Cluster the cells
pca.embeddings <- Embeddings(pbmc, "pca")[, 1:15]
kmeans.result <- kmeans(pca.embeddings, centers = 10)
pbmc$kmeans_clusters <- as.factor(kmeans.result$cluster)
Run non-linear dimensional reduction (UMAP/tSNE)
pbmc <- RunUMAP(pbmc, dims = 1:15)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")

Homework Problem 2
subpopulation_labels <- rep(NA, ncol(pbmc))
names(subpopulation_labels) <- colnames(pbmc)
for (cluster_id in unique(pbmc$kmeans_clusters)) {
cluster_cells <- WhichCells(pbmc, expression = kmeans_clusters == cluster_id)
pca_data <- Embeddings(pbmc, "pca")[cluster_cells, 1:10]
sil_widths <- sapply(2:6, function(k) {
km <- kmeans(pca_data, centers = k, nstart = 10)
sil <- cluster::silhouette(km$cluster, dist(pca_data))
mean(sil[, 3])
})
optimal_k <- which.max(sil_widths) + 1
km <- kmeans(pca_data, centers = optimal_k, nstart = 20)
cluster_labels <- paste0("Cluster", cluster_id, "_Sub", km$cluster)
names(cluster_labels) <- cluster_cells
subpopulation_labels[cluster_cells] <- cluster_labels
}
pbmc$subpopulation <- subpopulation_labels
library(ggplot2)
tsne_coords <- as.data.frame(Embeddings(pbmc, reduction = "umap"))
colnames(tsne_coords) <- c("UMAP_1", "UMAP_2")
tsne_coords$cluster <- pbmc$kmeans_clusters
tsne_coords$subpopulation <- pbmc$subpopulation
centers <- tsne_coords %>%
dplyr::group_by(cluster) %>%
dplyr::summarise(x = median(UMAP_1), y = median(UMAP_2))
ggplot(tsne_coords, aes(x = UMAP_1, y = UMAP_2, color = subpopulation)) + geom_point(size = 1.5,
alpha = 0.8) + geom_text(data = centers, aes(x = x, y = y, label = cluster), color = "black",
size = 5, fontface = "bold") + theme_classic() + labs(title = "t-SNE with subpopulations", x = "UMAP 1",
y = "UMAP 2")

DimPlot(pbmc, reduction = "umap", split.by = "kmeans_clusters", group.by = "subpopulation", ncol = 3)
